import pandas as pd
import numpy as np
from matplotlib import gridspec
import matplotlib.pyplot as plt
from rdkit import Chem, DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import Draw
from scipy.cluster.hierarchy import dendrogram, linkage
import plotly.express as px
import umap
import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# Load the Open Targets "targets" parquet directory as an ibis table
# (the backend was set to DuckDB at the top of the file).
t_targets = ibis.read_parquet("../../../data/open_targets/targets/")
# Bare expression: with ibis.options.interactive = True this renders a preview of the table.
t_targets
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ approvedSymbol ┃ biotype ┃ transcriptIds ┃ canonicalTranscript ┃ canonicalExons ┃ genomicLocation ┃ alternativeGenes ┃ approvedName ┃ go ┃ hallmarks ┃ synonyms ┃ symbolSynonyms ┃ nameSynonyms ┃ functionDescriptions ┃ subcellularLocations ┃ targetClass ┃ obsoleteSymbols ┃ obsoleteNames ┃ constraint ┃ tep ┃ proteinIds ┃ dbXrefs ┃ chemicalProbes ┃ homologues ┃ tractability ┃ safetyLiabilities ┃ pathways ┃ 
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ array<string> │ struct<id: string, chromosome: string, start: int64, end: int64, strand: string> │ array<string> │ struct<chromosome: string, start: int64, end: int64, strand: int32> │ array<string> │ string │ array<struct<id: string, source: string, evidence: string, aspect: string, gene… │ struct<a… │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> 
│ array<string> │ array<struct<location: string, source: string, termSL: string, labelSL: string>> │ array<struct<id: int64, label: string, level: string>> │ array<struct<label: string, source: string>> │ array<struct<label: string, source: string>> │ array<struct<constraintType: string, score: float32, exp: float32, obs: int32, … │ str… │ array<struct<id: string, source: string>> │ array<struct<id: string, source: string>> │ array<struct<… │ array<struct<speciesId: string, speciesName: string, homologyType: string, targ… │ array<struct<modality: string, id: string, value: boolean>> │ array<struct<event: string, eventId: string, effects: array<struct<direction: s… │ array<struct<pathwayId: string, pathway: string, topLevelTerm: string>> │ ├─────────────────┼────────────────┼──────────────────────┼─────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────┼─────────────────────────────────────────────────────────────────────┼─────────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼───────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────┼───────────────────────────────────────────┼───────────────────────────────────────────┼────────────────┼───────────────────────────
───────────────────────────────────────────────────────┼─────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────────────────────────────┤ │ ENSG00000020219 │ CCT8L1P │ processed_pseudogene │ ['ENST00000465400'] │ {'id': 'ENST00000465400', 'chromosome': '7', ... +3} │ ['152445477', '152447150'] │ {'chromosome': '7', 'start': 152445477, ... +2} │ NULL │ chaperonin containing TCP1 subunit 8 like 1, pseudogene │ NULL │ NULL │ [{...}, {...}, ... +4] │ [{...}, {...}, ... +2] │ [{...}, {...}] │ ['Possible molecular chaperone; assists the folding of proteins upon ATP hydrolysi'+4] │ [{...}] │ NULL │ [{...}] │ [{...}, {...}] │ NULL │ NULL │ [{...}] │ [{...}, {...}, ... +4] │ NULL │ NULL │ NULL │ NULL │ NULL │ │ ENSG00000059588 │ TARBP1 │ protein_coding │ ['ENST00000496673', 'ENST00000483404', ... +7] │ {'id': 'ENST00000040877', 'chromosome': '1', ... +3} │ ['234420702', '234420812', ... +58] │ {'chromosome': '1', 'start': 234391313, ... +2} │ NULL │ TAR (HIV-1) RNA binding protein 1 │ [{...}, {...}, ... +3] │ NULL │ [{...}, {...}, ... +20] │ [{...}, {...}, ... +10] │ [{...}, {...}, ... +8] │ ['Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates RN'+28, '(Microbial infection) In case of infection by HIV-1, it binds to the loop region'+444] │ [{...}] │ [{...}] │ [] │ [{...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}] │ [{...}, {...}, ... +8] │ NULL │ [{...}, {...}, ... +11] │ [{...}, {...}, ... +26] │ NULL │ NULL │ │ ENSG00000070182 │ SPTB │ protein_coding │ ['ENST00000553938', 'ENST00000389720', ... +5] │ {'id': 'ENST00000644917', 'chromosome': '14', ... +3} │ ['64785537', '64785627', ... +70] │ {'chromosome': '14', 'start': 64746283, ... +2} │ NULL │ spectrin beta, erythrocytic │ [{...}, {...}, ... +43] │ NULL │ [{...}, {...}, ... +15] │ [{...}, {...}, ... +5] │ [{...}, {...}, ... 
+8] │ ['Spectrin is the major constituent of the cytoskeletal network underlying the ery'+139] │ [{...}, {...}] │ NULL │ [] │ [] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +2] │ [{...}, {...}, ... +15] │ NULL │ [{...}, {...}, ... +22] │ [{...}, {...}, ... +26] │ NULL │ [{...}, {...}, ... +2] │ │ ENSG00000070366 │ SMG6 │ protein_coding │ ['ENST00000354901', 'ENST00000570756', ... +18] │ {'id': 'ENST00000263073', 'chromosome': '17', ... +3} │ ['2172658', '2172859', ... +36] │ {'chromosome': '17', 'start': 2059839, ... +2} │ NULL │ SMG6 nonsense mediated mRNA decay factor │ [{...}, {...}, ... +47] │ NULL │ [{...}, {...}, ... +23] │ [{...}, {...}, ... +11] │ [{...}, {...}, ... +10] │ ['Component of the telomerase ribonucleoprotein (RNP) complex that is essential fo'+685, 'Plays a role in nonsense-mediated mRNA decay (PubMed:18974281, PubMed:19060897, '+586] │ [{...}, {...}, ... +3] │ NULL │ [{...}] │ [{...}, {...}, ... +2] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +12] │ [{...}, {...}, ... +13] │ NULL │ [{...}, {...}, ... +10] │ [{...}, {...}, ... +26] │ NULL │ [{...}] │ │ ENSG00000072071 │ ADGRL1 │ protein_coding │ ['ENST00000361434', 'ENST00000589616', ... +6] │ {'id': 'ENST00000361434', 'chromosome': '19', ... +3} │ ['14160112', '14160297', ... +44] │ {'chromosome': '19', 'start': 14147743, ... +2} │ ['ENSG00000288324'] │ adhesion G protein-coupled receptor L1 │ [{...}, {...}, ... +21] │ NULL │ [{...}, {...}, ... +20] │ [{...}, {...}, ... +11] │ [{...}, {...}, ... +7] │ ['Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitato'+308] │ [{...}, {...}, ... +2] │ NULL │ [{...}] │ [{...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +5] │ [{...}, {...}, ... +14] │ NULL │ [{...}, {...}, ... +18] │ [{...}, {...}, ... +26] │ NULL │ NULL │ │ ENSG00000073536 │ NLE1 │ protein_coding │ ['ENST00000589367', 'ENST00000360831', ... +5] │ {'id': 'ENST00000442241', 'chromosome': '17', ... +3} │ ['35133339', '35133498', ... 
+24] │ {'chromosome': '17', 'start': 35128730, ... +2} │ NULL │ notchless homolog 1 │ [{...}, {...}, ... +12] │ NULL │ [{...}, {...}, ... +7] │ [{...}, {...}, ... +3] │ [{...}, {...}, ... +2] │ ['Plays a role in regulating Notch activity. Plays a role in regulating the expres'+176] │ [{...}, {...}, ... +1] │ NULL │ [{...}] │ [{...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +6] │ [{...}, {...}, ... +8] │ NULL │ [{...}, {...}, ... +10] │ [{...}, {...}, ... +26] │ NULL │ NULL │ │ ENSG00000075290 │ WNT8B │ protein_coding │ ['ENST00000343737'] │ {'id': 'ENST00000343737', 'chromosome': '10', ... +3} │ ['100479874', '100480012', ... +10] │ {'chromosome': '10', 'start': 100463009, ... +2} │ NULL │ Wnt family member 8B │ [{...}, {...}, ... +18] │ NULL │ [{...}, {...}, ... +4] │ [{...}, {...}] │ [{...}, {...}, ... +2] │ ['Ligand for members of the frizzled family of seven transmembrane receptors. May '+121] │ [{...}] │ NULL │ [] │ [{...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +4] │ [{...}, {...}, ... +9] │ NULL │ [{...}, {...}, ... +14] │ [{...}, {...}, ... +26] │ NULL │ [{...}, {...}, ... +2] │ │ ENSG00000083454 │ P2RX5 │ protein_coding │ ['ENST00000552276', 'ENST00000551178', ... +10] │ {'id': 'ENST00000225328', 'chromosome': '17', ... +3} │ ['3695869', '3696155', ... +22] │ {'chromosome': '17', 'start': 3672199, ... +2} │ NULL │ purinergic receptor P2X 5 │ [{...}, {...}, ... +18] │ NULL │ [{...}, {...}, ... +14] │ [{...}, {...}, ... +6] │ [{...}, {...}, ... +6] │ ['Receptor for ATP that acts as a ligand-gated ion channel.'] │ [{...}, {...}] │ [{...}, {...}, ... +1] │ [] │ [{...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +11] │ [{...}, {...}, ... +6] │ NULL │ [{...}, {...}, ... +11] │ [{...}, {...}, ... +26] │ [{...}, {...}, ... +6] │ [{...}, {...}] │ │ ENSG00000083782 │ EPYC │ protein_coding │ ['ENST00000261172', 'ENST00000551767', ... +1] │ {'id': 'ENST00000261172', 'chromosome': '12', ... +3} │ ['90970044', '90970139', ... 
+12] │ {'chromosome': '12', 'start': 90963682, ... +2} │ NULL │ epiphycan │ [{...}, {...}, ... +5] │ NULL │ [{...}, {...}, ... +20] │ [{...}, {...}, ... +10] │ [{...}, {...}, ... +8] │ ['May have a role in bone formation and also in establishing the ordered structure'+42] │ [{...}] │ NULL │ [{...}] │ [{...}, {...}] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +3] │ [{...}, {...}, ... +6] │ NULL │ [{...}, {...}, ... +13] │ [{...}, {...}, ... +26] │ NULL │ NULL │ │ ENSG00000086200 │ IPO11 │ protein_coding │ ['ENST00000409296', 'ENST00000506905', ... +12] │ {'id': 'ENST00000325324', 'chromosome': '5', ... +3} │ ['62442983', '62443083', ... +58] │ {'chromosome': '5', 'start': 62403972, ... +2} │ NULL │ importin 11 │ [{...}, {...}, ... +7] │ NULL │ [{...}, {...}, ... +12] │ [{...}, {...}, ... +6] │ [{...}, {...}, ... +4] │ ['Functions in nuclear protein import as nuclear transport receptor. Serves as rec'+807] │ [{...}, {...}, ... +2] │ NULL │ [] │ [] │ [{...}, {...}, ... +1] │ NULL │ [{...}, {...}, ... +12] │ [{...}, {...}, ... +4] │ NULL │ [{...}, {...}, ... +14] │ [{...}, {...}, ... 
+26] │ NULL │ NULL │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────────┴────────────────┴──────────────────────┴─────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────┴─────────────────────────────────────────────────────────────────────┴─────────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴───────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────┴───────────────────────────────────────────┴───────────────────────────────────────────┴────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────────────────────────┘
# Look at one nested column on its own: each row holds an array of structs.
t_targets.subcellularLocations
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ subcellularLocations ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ array<struct<location: string, source: string, termSL: string, labelSL: string>> │ ├──────────────────────────────────────────────────────────────────────────────────┤ │ [{...}] │ │ [{...}] │ │ [{...}, {...}] │ │ [{...}, {...}, ... +3] │ │ [{...}, {...}, ... +2] │ │ [{...}, {...}, ... +1] │ │ [{...}] │ │ [{...}, {...}] │ │ [{...}] │ │ [{...}, {...}, ... +2] │ │ … │ └──────────────────────────────────────────────────────────────────────────────────┘
# Explode the array column so that every array element gets its own row
# (the target id is repeated for each element).
(t_targets
.select("id", _.subcellularLocations.unnest()) # "_" stands for the table the method is called on, so it keeps working inside a chain
)
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ subcellularLocations ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ struct<location: string, source: string, termSL: string, labelSL: string> │ ├─────────────────┼───────────────────────────────────────────────────────────────────────────┤ │ ENSG00000020219 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2} │ │ ENSG00000059588 │ {'location': 'Nuclear speckles', 'source': 'HPA_main', ... +2} │ │ ENSG00000070182 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2} │ │ ENSG00000070182 │ {'location': 'Cytosol', 'source': 'HPA_main', ... +2} │ │ ENSG00000070366 │ {'location': 'Nucleus', 'source': 'uniprot', ... +2} │ │ ENSG00000070366 │ {'location': 'Chromosome', 'source': 'uniprot', ... +2} │ │ ENSG00000070366 │ {'location': 'Cytoplasm', 'source': 'uniprot', ... +2} │ │ ENSG00000070366 │ {'location': 'Nucleoli', 'source': 'HPA_main', ... +2} │ │ ENSG00000070366 │ {'location': 'Cytosol', 'source': 'HPA_additional', ... +2} │ │ ENSG00000072071 │ {'location': 'Cell membrane', 'source': 'uniprot', ... +2} │ │ … │ … │ └─────────────────┴───────────────────────────────────────────────────────────────────────────┘
# Same unnest as before, then expand the struct's fields
# (location, source, termSL, labelSL) into ordinary columns.
(t_targets
.select("id", _.subcellularLocations.unnest())
.unpack("subcellularLocations")
)
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ location ┃ source ┃ termSL ┃ labelSL ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ string │ string │ ├─────────────────┼──────────────────┼────────────────┼─────────┼────────────────────┤ │ ENSG00000020219 │ Cytoplasm │ uniprot │ SL-0086 │ Cellular component │ │ ENSG00000059588 │ Nuclear speckles │ HPA_main │ SL-0186 │ Nucleus speckle │ │ ENSG00000070182 │ Cytoplasm │ uniprot │ SL-0086 │ Cellular component │ │ ENSG00000070182 │ Cytosol │ HPA_main │ SL-0091 │ Cytosol │ │ ENSG00000070366 │ Nucleus │ uniprot │ SL-0191 │ Cellular component │ │ ENSG00000070366 │ Chromosome │ uniprot │ SL-0468 │ Cellular component │ │ ENSG00000070366 │ Cytoplasm │ uniprot │ SL-0086 │ Cellular component │ │ ENSG00000070366 │ Nucleoli │ HPA_main │ SL-0188 │ Nucleolus │ │ ENSG00000070366 │ Cytosol │ HPA_additional │ SL-0091 │ Cytosol │ │ ENSG00000072071 │ Cell membrane │ uniprot │ SL-0039 │ Cellular component │ │ … │ … │ … │ … │ … │ └─────────────────┴──────────────────┴────────────────┴─────────┴────────────────────┘
# Show the SQL text that ibis generates for the unnest + unpack expression;
# nothing is executed here, the expression is only compiled and stringified.
str(t_targets
.select("id", _.subcellularLocations.unnest())
.unpack("subcellularLocations")
.compile()
)
'SELECT t0.id, struct_extract(t0."subcellularLocations", \'location\') AS location, struct_extract(t0."subcellularLocations", \'source\') AS source, struct_extract(t0."subcellularLocations", \'termSL\') AS "termSL", struct_extract(t0."subcellularLocations", \'labelSL\') AS "labelSL" \nFROM (SELECT t1.id AS id, unnest(t1."subcellularLocations") AS "subcellularLocations" \nFROM _ibis_read_parquet_9bs60q265u7upy5bflh2c30wb AS t1) AS t0'
# Load the Open Targets "molecule" parquet directory (one row per ChEMBL id,
# including the canonicalSmiles column used further down).
t_molecule = ibis.read_parquet("../../../data/open_targets/molecule/")
# Bare expression: renders a preview of the table in interactive mode.
t_molecule
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ canonicalSmiles ┃ inchiKey ┃ drugType ┃ blackBoxWarning ┃ name ┃ yearOfFirstApproval ┃ maximumClinicalTrialPhase ┃ parentId ┃ hasBeenWithdrawn ┃ isApproved ┃ withdrawnNotice ┃ tradeNames ┃ synonyms ┃ crossReferences ┃ childChemblIds ┃ linkedDiseases ┃ linkedTargets ┃ description ┃ ┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ string │ boolean │ string │ int64 │ int64 │ string │ boolean │ boolean │ struct<countri… │ array<string> │ array<string> │ map<string, array<string>> │ array<string> │ struct<rows: array<string>, count: int32> │ struct<rows: array<string>, count: int32> │ 
string │ ├───────────────┼──────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────┼────────────────┼─────────────────┼─────────────────────────────────────┼─────────────────────┼───────────────────────────┼───────────────┼──────────────────┼────────────┼─────────────────┼────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────────────────────┼────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤ │ CHEMBL110739 │ C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C@@H]1CC[C@@H]2C(=O)CO │ OMFXVFTZEKFJBZ-HJTSIMOOSA-N │ Small molecule │ False │ CORTICOSTERONE │ NULL │ 3 │ NULL │ False │ False │ NULL │ [] │ ['11-b,21-Dihydroxypregn-3,20-dione', '17-Deoxycortisol', ... +6] │ {'PubChem': [...], 'Wikipedia': [...], ... +2} │ NULL │ {'rows': [...], 'count': 1} │ NULL │ Small molecule drug with a maximum clinical trial phase of III and has 1 invest… │ │ CHEMBL1195 │ CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC │ CAJIGINSTLKQMM-UHFFFAOYSA-N │ Small molecule │ False │ PROPOXYCAINE │ 1982 │ 4 │ NULL │ False │ True │ NULL │ [] │ ['Propoxycaine'] │ {'PubChem': [...], 'Wikipedia': [...], ... +2} │ ['CHEMBL1769'] │ NULL │ {'rows': [...], 'count': 10} │ Small molecule drug with a maximum clinical trial phase of IV that was first ap… │ │ CHEMBL1200632 │ CCCCCCCCCCCCCCCC(=O)O[C@@H]1[C@@H](O)[C@@H](O)[C@@H]([C@H](NC(=O)[C@@H]2C[C@@H]… │ GTNDZRUWKHDICY-DJHAJVGHSA-N │ Small molecule │ True │ CLINDAMYCIN PALMITATE HYDROCHLORIDE │ 1986 │ 4 │ CHEMBL1201289 │ False │ True │ NULL │ ['Cleocin', 'Clindamycin palmitate hydrochloride'] │ ['Clindamycin palmitate hcl', 'Clindamycin palmitate hydrochloride', ... 
+4] │ {'DailyMed': [...], 'PubChem': [...]} │ NULL │ {'rows': [...], 'count': 3} │ {'rows': [...], 'count': 0} │ Small molecule drug with a maximum clinical trial phase of IV (across all indic… │ │ CHEMBL1200691 │ CC(=O)[O-].CC(=O)[O-].[Mg+2] │ UEGPKNKPLBYCNK-UHFFFAOYSA-L │ Small molecule │ False │ MAGNESIUM ACETATE │ NULL │ 4 │ NULL │ False │ True │ NULL │ [] │ ['Acetic acid, magnesium salt', 'Magnesium acetate', ... +3] │ {'DailyMed': [...], 'DrugCentral': [...], ... +4} │ ['CHEMBL3989858'] │ NULL │ NULL │ Small molecule drug with a maximum clinical trial phase of IV. │ │ CHEMBL1201042 │ CC(O)(P(=O)([O-])O)P(=O)([O-])O.[Na+].[Na+] │ GWBBVOVXJZATQQ-UHFFFAOYSA-L │ Small molecule │ False │ ETIDRONATE DISODIUM │ 1977 │ 4 │ CHEMBL871 │ False │ True │ NULL │ ['Didronel', 'Didronel iv', ... +2] │ ['Disodium etidronate', 'Etidronate disodium', ... +3] │ {'PubChem': [...], 'chEBI': [...]} │ NULL │ NULL │ {'rows': [...], 'count': 0} │ Small molecule drug with a maximum clinical trial phase of IV that was first ap… │ │ CHEMBL121790 │ Cc1cccc(-c2nn3c(c2-c2ccc(F)cc2)CCC3)n1 │ NBDZLUOYAAVYHF-UHFFFAOYSA-N │ Small molecule │ False │ CHEMBL121790 │ NULL │ 0 │ NULL │ False │ False │ NULL │ [] │ [] │ {'drugbank': [...]} │ NULL │ NULL │ NULL │ Small molecule drug with a maximum clinical trial phase of I. │ │ CHEMBL1231 │ CCN(CC)CC#CCOC(=O)C(O)(c1ccccc1)C1CCCCC1 │ XIQVNETUBQGFHX-UHFFFAOYSA-N │ Small molecule │ False │ OXYBUTYNIN │ 1975 │ 4 │ NULL │ False │ True │ NULL │ ['Anturol', 'Contimin 2.5', ... +11] │ ['Ditropan', 'Oxybutynin'] │ {'DailyMed': [...], 'PubChem': [...], ... 
+3} │ ['CHEMBL1133'] │ {'rows': [...], 'count': 15} │ {'rows': [...], 'count': 2} │ Small molecule drug with a maximum clinical trial phase of IV (across all indic… │ │ CHEMBL1231592 │ CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C[C@@H](OS(=O)(=O)O)CC[C@]4(C)[C@H]3… │ BHYOQNUELFTYRT-DPAQBDIFSA-N │ Small molecule │ False │ CHEMBL1231592 │ NULL │ 0 │ NULL │ False │ False │ NULL │ [] │ [] │ {'drugbank': [...], 'chEBI': [...]} │ ['CHEMBL4475544'] │ NULL │ NULL │ Small molecule drug with a maximum clinical trial phase of I. │ │ CHEMBL1232182 │ OCc1cccc(F)c1F │ JSFGDUIJQWWBGY-UHFFFAOYSA-N │ Small molecule │ False │ 2,3-Difluorobenzyl Alcohol │ NULL │ 0 │ NULL │ False │ False │ NULL │ [] │ ['2,3-Difluorobenzyl Alcohol'] │ {'drugbank': [...], 'chEBI': [...]} │ NULL │ NULL │ NULL │ Small molecule drug with a maximum clinical trial phase of I. │ │ CHEMBL1233511 │ O=P(O)(O)O[C@H]1[C@H](OP(=O)(O)O)[C@@H](OP(=O)(O)O)[C@H](OP(=O)(O)O)[C@@H](OP(=… │ IMQLKJBTEOYOSI-GPIVLXJGSA-N │ Small molecule │ False │ PHYTIC ACID │ NULL │ 3 │ NULL │ False │ False │ NULL │ [] │ ['Alkalovert', 'Dermofeel pa-3', ... 
+9] │ {'drugbank': [...], 'chEBI': [...]} │ ['CHEMBL3989600', 'CHEMBL2106435'] │ {'rows': [...], 'count': 1} │ NULL │ Small molecule drug with a maximum clinical trial phase of III and has 1 invest… │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └───────────────┴──────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────┴────────────────┴─────────────────┴─────────────────────────────────────┴─────────────────────┴───────────────────────────┴───────────────┴──────────────────┴────────────┴─────────────────┴────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────────────────────┴────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
# Load the Open Targets "mechanismOfAction" parquet directory; each row links an
# actionType to an array of ChEMBL ids (chemblIds).
t_mechanismOfAction = ibis.read_parquet("../../../data/open_targets/mechanismOfAction/")
# Bare expression: renders a preview of the table in interactive mode.
t_mechanismOfAction
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ actionType ┃ mechanismOfAction ┃ chemblIds ┃ targetName ┃ targetType ┃ targets ┃ references ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ array<string> │ string │ string │ array<string> │ array<struct<source: string, ids: array<string>, urls: array<string>>> │ ├───────────────────────────────┼─────────────────────────────────────────────────────────────┼────────────────────────────────────┼──────────────────────────────────────────────┼───────────────────────┼────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────┤ │ ANTAGONIST │ Orexin receptor 2 antagonist │ ['CHEMBL3545367'] │ Orexin receptor 2 │ single protein │ ['ENSG00000137252'] │ [{...}] │ │ ANTAGONIST │ Gonadotropin-releasing hormone receptor antagonist │ ['CHEMBL2028987', 'CHEMBL415606'] │ Gonadotropin-releasing hormone receptor │ single protein │ ['ENSG00000109163'] │ [{...}] │ │ INHIBITOR │ Topoisomerase IV │ ['CHEMBL8'] │ Topoisomerase IV │ protein complex │ [] │ [{...}] │ │ INHIBITOR │ DNA polymerase/reverse transcriptase inhibitor │ ['CHEMBL1652128'] │ DNA polymerase/reverse transcriptase │ single protein │ [] │ [{...}] │ │ AGONIST │ Insulin receptor agonist │ ['CHEMBL1201664'] │ Insulin receptor │ single protein │ ['ENSG00000171105'] │ [{...}] │ │ INHIBITOR │ Hepatitis A virus cellular receptor 2 inhibitor │ 
['CHEMBL4298123'] │ Hepatitis A virus cellular receptor 2 │ single protein │ ['ENSG00000135077'] │ [{...}, {...}] │ │ ANTAGONIST │ Muscle-type nicotinic acetylcholine receptor antagonist │ ['CHEMBL1200549', 'CHEMBL1201352'] │ Muscle-type nicotinic acetylcholine receptor │ protein complex group │ ['ENSG00000138435', 'ENSG00000196811', ... +3] │ [{...}, {...}] │ │ ANTAGONIST │ Endothelin receptor ET-A antagonist │ ['CHEMBL23261'] │ Endothelin receptor ET-A │ single protein │ ['ENSG00000151617'] │ [{...}] │ │ INHIBITOR │ Poly [ADP-ribose] polymerase-1 inhibitor │ ['CHEMBL3137318', 'CHEMBL3137320'] │ Poly [ADP-ribose] polymerase-1 │ single protein │ ['ENSG00000143799'] │ [{...}] │ │ POSITIVE ALLOSTERIC MODULATOR │ GABA receptor alpha-3 subunit positive allosteric modulator │ ['CHEMBL1783256'] │ GABA receptor alpha-3 subunit │ single protein │ ['ENSG00000011677'] │ [{...}] │ │ … │ … │ … │ … │ … │ … │ … │ └───────────────────────────────┴─────────────────────────────────────────────────────────────┴────────────────────────────────────┴──────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────┘
# Build the labelled molecule table one step at a time.

# 1. actionType becomes the label, so rows without one are discarded.
moa_labelled = t_mechanismOfAction.dropna(_.actionType)

# 2. Explode chemblIds into one row per molecule and name the column "id"
#    so that it matches the key column of t_molecule.
moa_per_molecule = moa_labelled.select(_.actionType, id=_.chemblIds.unnest())

# 3. For this demo a drug keeps a single label: the first actionType the
#    aggregate sees for that id.
label_per_molecule = moa_per_molecule.group_by(_.id).agg(
    actionType=_.actionType.first()
)

# 4. Attach the molecule records and keep only those that have a SMILES string.
molecules_with_labels = (
    label_per_molecule
    .inner_join(t_molecule, "id")
    .dropna("canonicalSmiles")
)
# Row counts before and after restricting to labelled molecules with SMILES
print(f"Before Filter: {t_molecule.count()}")
print(f"After Filter: {molecules_with_labels.count()}")
Before Filter: 12854
After Filter: 4126
# Pull the columns needed downstream into a pandas DataFrame
df = (
    molecules_with_labels
    .select("id", "actionType", "name", "canonicalSmiles")
    .execute()
)

# Parse every SMILES string with RDKit and store the drug name on the molecule.
# Chem.MolFromSmiles returns None (it does not raise) when a string cannot be
# parsed, so those rows are skipped instead of crashing on mol.SetProp.
mols = []
parsed_positions = []
for position, (mol_name, mol_smile) in enumerate(
    zip(df["name"], df["canonicalSmiles"])
):
    mol = Chem.MolFromSmiles(mol_smile)
    if mol is None:
        continue
    mol.SetProp('_Name', mol_name)
    mols.append(mol)
    parsed_positions.append(position)

# Per-molecule results are assigned back onto df by position further down, so
# df must keep exactly one row per entry of mols, in the same order.
n_unparsed = len(df) - len(mols)
if n_unparsed:
    # print rather than warnings.warn: warnings are silenced at the top of the file
    print(f"Skipped {n_unparsed} molecule(s) whose SMILES could not be parsed")
    df = df.iloc[parsed_positions].reset_index(drop=True)
# Preview the molecules as a grid image, captioned with the drug names.
# maxMols=20 caps how many molecules are drawn.
mol_names = [m.GetProp('_Name') for m in mols]
Draw.MolsToGridImage(
    mols,
    molsPerRow=10,
    subImgSize=(150, 150),
    maxMols=20,
    legends=mol_names,
)
# One fingerprint per molecule, in the same order as mols
fingerprints = [FingerprintMols.FingerprintMol(m) for m in mols]

# Full pairwise similarity matrix: entry [i][j] compares fingerprint i with
# fingerprint j. The matrix should be symmetric, so only the upper triangle
# would strictly be needed; the full square is computed for simplicity.
sim_matrix = np.asarray(
    [
        [DataStructs.FingerprintSimilarity(fp_a, fp_b) for fp_b in fingerprints]
        for fp_a in fingerprints
    ]
)
# Heatmap of the similarities among the first 40 molecules only
# (rows 0-39 and columns 0-39 of the full matrix).
sub_mat = sim_matrix[:40, :40]
preview_names = df["name"][0:40]
config = {"scrollZoom": True, "doubleClick": "reset"}
heatmap = px.imshow(sub_mat, x=preview_names, y=preview_names)
heatmap.show(config=config)
import plotly.figure_factory as ff

# Dendrogram over the same first 40 molecules, labelled with their names.
# NOTE(review): create_dendrogram appears to treat each row of sub_mat as an
# observation vector and derive distances itself, i.e. molecules are clustered
# by their similarity profiles rather than by (1 - similarity) - confirm this
# is the intent.
config = {"scrollZoom": True, "doubleClick": "reset"}
labels = [m.GetProp('_Name') for m in mols[:40]]
dendro_fig = ff.create_dendrogram(
    sub_mat,
    labels=labels,
    orientation='left',
)
dendro_fig.show(config=config)
# Project every molecule's row of the similarity matrix down to 3 dimensions.
# The result is transposed so that embeddings holds one row per axis.
embeddings = umap.UMAP(n_components=3).fit_transform(sim_matrix).T
for axis_name, axis_values in zip(("x", "y", "z"), embeddings):
    df[axis_name] = axis_values

# Positional index, shown on hover so a point can be traced back to mols / sim_matrix
df["index"] = list(range(len(df)))

# Interactive 3D scatter, coloured by the actionType label
fig = px.scatter_3d(
    df,
    x="x",
    y="y",
    z="z",
    hover_name="name",
    hover_data=["index"],
    color="actionType",
)
fig.update_traces(marker={"size": 2})
fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
fig.show()